"""Minimal example: launch an LLM on a local Xinference server and chat with it.

Requires a running Xinference endpoint (serving at http://localhost:9997)
with the ``baichuan-2`` model available to it.
"""
from xinference.client import Client


def main() -> None:
    """Launch ``baichuan-2``, send one prompt, and print the model's reply."""
    client = Client("http://localhost:9997")

    # Other chat-capable models (e.g. "chatglm2") can be substituted here,
    # provided the server has them registered.
    model_uid = client.launch_model(model_name="baichuan-2")
    model = client.get_model(model_uid)

    chat_history: list[dict] = []  # empty history: single-turn conversation
    prompt = "What is the largest animal?"

    # Capture and display the completion — the return value is the whole
    # point of the call, so don't discard it.
    response = model.chat(
        prompt,
        chat_history,
        generate_config={"max_tokens": 1024},
    )
    print(response)


if __name__ == "__main__":
    main()